HW 01

Author

Nandakumar Kuthalaraja

1 0 - Setup

# Bootstrap: install pacman only when it cannot be loaded.
if (!require("pacman")) {
  install.packages("pacman")
}
Loading required package: pacman
pacman::p_load(tidyverse, glue, scales, ggthemes, openintro, ggrepel)

# Install dsbox from GitHub only when it is missing, so that re-rendering the
# document does not need network access or a GitHub token every time.
if (!requireNamespace("dsbox", quietly = TRUE)) {
  devtools::install_github("tidyverse/dsbox")
}
Using GitHub PAT from the git credential store.
Skipping install of 'dsbox' from a github remote, the SHA1 (244ecdfe) has not changed since last install.
  Use `force = TRUE` to force installation
# Default ggplot2 theme for every figure in the document
ggplot2::theme_set(
  ggplot2::theme_minimal(base_size = 14)
)

# Wrap printed code output at 65 characters
options(width = 65)

# knitr figure defaults
knitr::opts_chunk$set(
  dpi = 300,            # higher dpi, sharper image
  fig.retina = 3,       # dpi multiplier for HTML output on retina displays
  fig.align = "center", # center align figures
  fig.asp = 0.618,      # height / width: the golden ratio
  fig.width = 7         # width in inches
)



### All responses are written as comments within the code.

2 1 - Road traffic accidents in Edinburgh

# Read the accidents data from the project's data/ folder.
# readr guesses the column types and reports them when the file is read.
accidents <- read_csv("data/accidents.csv")
Rows: 768 Columns: 31
── Column specification ─────────────────────────────────────────
Delimiter: ","
chr  (18): id, severity, date, day_of_week, highway, first_ro...
dbl  (12): easting, northing, longitude, latitude, police_for...
time  (1): time

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Label every accident as falling on a weekend or a weekday.
# week_start = 7 pins the numbering to Sunday = 1 ... Saturday = 7 regardless
# of the `lubridate.week.start` option, so days 2-6 are Monday to Friday.
# A missing date stays NA instead of being assigned to either group.
accidents <- accidents %>%
  mutate(
    day_type = if_else(
      between(wday(date, week_start = 7), 2, 6),
      "Weekday",
      "Weekend"
    )
  )

# Density of accident times, one panel per day type, filled by severity.
ggplot(accidents, aes(x = time, fill = severity)) +
  geom_density(alpha = 0.7) +
  facet_wrap(~day_type, ncol = 1) +
  # Hand-picked fills: the closest match found to the target colours.
  scale_fill_manual(
    values = c(
      "Fatal" = "gray",
      "Serious" = "deepskyblue3",
      "Slight" = "yellow3"
    )
  ) +
  # Tick marks every four hours, from 00:00:00 through 24:00:00.
  scale_x_time(breaks = hms::hms(hours = seq(0, 24, by = 4))) +
  labs(
    title = "Number of accidents throughout the day",
    subtitle = "By day of week and severity",
    x = "Time of day",
    y = "Density",
    fill = "Severity"
  ) +
  theme(legend.position = "right")

Interpretation:

The plot shows the distribution of accident times for each of the three severity levels, in two panels: one for weekdays and one for weekends.

3 2 - NYC marathon winners

3.1 (a)

# Read the NYC marathon data from the project's data/ folder.
# readr guesses the column types and reports them when the file is read.
marathon <- read_csv("data/nyc_marathon.csv")
Rows: 102 Columns: 7
── Column specification ─────────────────────────────────────────
Delimiter: ","
chr  (4): name, country, division, note
dbl  (2): year, time_hrs
time (1): time

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# (a)

# Convert each finish time to seconds and keep only rows where the result is
# a finite number.
marathon_a <- marathon %>%
  mutate(time_seconds = period_to_seconds(hms(time))) %>%
  filter(is.finite(time_seconds))

# Histogram of finish times in minutes, in 10-minute bins
ggplot(marathon_a, aes(x = time_seconds / 60)) +
  geom_histogram(binwidth = 10, fill = "green4", color = "black") +
  labs(
    title = "Distribution of Finish Times",
    x = "Finish Time (minutes)",
    y = "Number of Runners"
  )

# Boxplot of the same finish times in minutes
ggplot(marathon_a, aes(x = time_seconds / 60)) +
  geom_boxplot(
    fill = "lightgreen",
    color = "green4",
    outlier.color = "black"
  ) +
  labs(
    title = "Boxplot of Finish Times",
    x = "Finish Time (minutes)"
  )

  ### A boxplot shows the median and outliers effectively, while a histogram shows the shape of the distribution in more detail, as bars

3.2 (b)

# (b)

# Side-by-side boxplots of finish time (in minutes) for each division
ggplot(
  marathon_a,
  aes(x = division, y = time_seconds / 60, fill = division)
) +
  geom_boxplot(alpha = 0.6, color = "darkgray") +
  scale_fill_manual(values = c("Men" = "green4", "Women" = "orange")) +
  labs(
    title = "Finish Times by Gender",
    x = "Division",
    y = "Finish Time (minutes)"
  )

  ### Men have a lower (faster) median finish time than women, and the women's times have more widely spread outliers than the men's

3.3 (c)

# (c)

### The legend is redundant because the x axis already names each division.
### Removing it takes away no information and leaves a less cluttered,
### clearer plot.

# Same boxplots as in (b), with the legend hidden
ggplot(
  marathon_a,
  aes(x = division, y = time_seconds / 60, fill = division)
) +
  geom_boxplot(alpha = 0.6, color = "darkgray") +
  scale_fill_manual(values = c("Men" = "green4", "Women" = "orange")) +
  labs(
    title = "Finish Times by Gender",
    x = "Division",
    y = "Finish Time (minutes)"
  ) +
  theme(legend.position = "none")

3.4 (d)

# (d)

# Finish time (in minutes) by year, one coloured line per division.
# Every layer, including labs(), has to be chained with `+`: a detached
# labs() call is printed on its own and is never applied to the plot.
ggplot(marathon_a, aes(x = year, y = time_seconds / 60, color = division)) +
  # `linewidth` sets line thickness (ggplot2 >= 3.4.0)
  geom_line(linewidth = 1) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = c("Men" = "black", "Women" = "blue")) +
  labs(
    title = "Winning Times Over the Years",
    x = "Year",
    y = "Finish Time (minutes)",
    color = "Division"
  )
  ### Unlike the earlier plots, this one lets us see trends in the winning times over the years

4 3 - US counties

4.1 (a)

### (a) The two layers use completely different x-axis variables, so the boxplot didn't even render. I don't think this snippet uses the correct parameters.

# Two layers drawn from `county` on one set of axes:
#   - points of median_hh_income against median_edu
#   - boxplots of pop2017 for each smoking_ban value
# The layers map different variables to both x and y, so unrelated
# quantities end up sharing the same axes.
ggplot(county) +
  geom_point(aes(x = median_edu, y = median_hh_income)) +
  geom_boxplot(aes(x = smoking_ban, y = pop2017))
Warning: Removed 3 rows containing non-finite outside the scale range
(`stat_boxplot()`).
Warning: Removed 2 rows containing missing values or values outside the
scale range (`geom_point()`).

4.2 (b)

### (b) The plot with the column facet grid looks visually clearer and is easier to comprehend. Use a row-based facet grid when you want to avoid horizontal scrolling, and a column-based one when there is a moderate number of variables, for better clarity.

# The same poverty vs. homeownership scatterplot, faceted by median education
# level in two layouts. Counties with a missing median_edu are dropped first.

# Education levels stacked as rows
county %>%
  filter(!is.na(median_edu)) %>%
  ggplot(aes(x = homeownership, y = poverty)) +
  geom_point() +
  facet_grid(rows = vars(median_edu))

# Education levels side by side as columns
county %>%
  filter(!is.na(median_edu)) %>%
  ggplot(aes(x = homeownership, y = poverty)) +
  geom_point() +
  facet_grid(cols = vars(median_edu))

4.3 (c)

### (c)

  ## Plot A: plain scatterplot of poverty against homeownership
  county %>%
    ggplot(aes(x = homeownership, y = poverty)) +
    geom_point() +
    labs(title = "Plot A")
Warning: Removed 2 rows containing missing values or values outside the
scale range (`geom_point()`).

  ## Plot B: scatterplot with a single blue loess curve, no confidence band
  county %>%
    ggplot(aes(x = homeownership, y = poverty)) +
    geom_point() +
    geom_smooth(color = "blue", se = FALSE, method = "loess") +
    labs(title = "Plot B")
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Removed 2 rows containing missing values or values outside the
scale range (`geom_point()`).

  ## Plot C: scatterplot with two green fits (gam and glm), no confidence band
  ## `linewidth` sets the thickness of the fitted lines (ggplot2 >= 3.4.0)
  ggplot(county, aes(x = homeownership, y = poverty)) +
    geom_point() +
    geom_smooth(method = "gam", se = FALSE, color = "green", linewidth = 1.2) +
    geom_smooth(method = "glm", se = FALSE, color = "green", linewidth = 1.2) +
    ggtitle("Plot C")
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Removed 2 rows containing missing values or values outside the
scale range (`geom_point()`).

  ## Plot D: all counties as points; the two blue fits (lm and gam) use only
  ## counties with homeownership strictly between 25 and 85
  county %>%
    ggplot(aes(x = homeownership, y = poverty)) +
    geom_point() +
    geom_smooth(
      data = function(d) subset(d, homeownership > 25 & homeownership < 85),
      method = "lm",
      color = "blue",
      se = FALSE
    ) +
    geom_smooth(
      data = function(d) subset(d, homeownership > 25 & homeownership < 85),
      method = "gam",
      color = "blue",
      se = FALSE
    ) +
    labs(title = "Plot D")
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Removed 2 rows containing missing values or values outside the
scale range (`geom_point()`).

  ## Plot E: points and linear fits coloured by metro; the line type of each
  ## fit also varies by metro
  county %>%
    ggplot(aes(x = homeownership, y = poverty, color = metro)) +
    geom_point() +
    geom_smooth(aes(linetype = metro), se = FALSE, method = "lm") +
    labs(title = "Plot E")
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Removed 2 rows containing missing values or values outside the
scale range (`geom_point()`).

  ## Plot F: points and gam fits, both coloured by metro
  county %>%
    ggplot(aes(x = homeownership, y = poverty, color = metro)) +
    geom_point() +
    geom_smooth(se = FALSE, method = "gam") +
    labs(title = "Plot F")
`geom_smooth()` using formula = 'y ~ s(x, bs = "cs")'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Removed 2 rows containing missing values or values outside the
scale range (`geom_point()`).

  ## Plot G: points coloured by metro, with one blue loess curve fitted to all
  ## counties together
  county %>%
    ggplot(aes(x = homeownership, y = poverty)) +
    geom_point(aes(color = metro)) +
    geom_smooth(color = "blue", se = FALSE, method = "loess") +
    labs(title = "Plot G")
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_smooth()`).
Removed 2 rows containing missing values or values outside the
scale range (`geom_point()`).

  ## Plot H: scatterplot coloured by metro, no fitted line
  county %>%
    ggplot(aes(x = homeownership, y = poverty, color = metro)) +
    geom_point() +
    labs(title = "Plot H")
Warning: Removed 2 rows containing missing values or values outside the
scale range (`geom_point()`).

5 4 - Credit Card Balances

# Read the credit card data from the project's data/ folder.
# readr guesses the column types and reports them when the file is read.
creditcard <- read_csv("data/credit.csv")
Rows: 400 Columns: 5
── Column specification ─────────────────────────────────────────
Delimiter: ","
chr (2): student, married
dbl (3): balance, income, limit

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
  ### (a)

  # Balance against income, one panel per student x married combination,
  # with a linear fit per panel. Colour and point shape both encode `student`.
  creditcard %>%
    ggplot(aes(x = income, y = balance)) +
    geom_point(aes(color = student, shape = student), alpha = 0.5) +
    geom_smooth(aes(color = student), se = FALSE, method = "lm") +
    scale_color_manual(values = c("No" = "orange", "Yes" = "pink3")) +
    # Strip labels read "<variable>: <value>"
    facet_grid(
      rows = vars(student),
      cols = vars(married),
      labeller = label_both
    ) +
    scale_x_continuous(labels = label_dollar(suffix = "k")) +
    scale_y_continuous(labels = label_dollar()) +
    labs(x = "Income", y = "Credit card balance") +
    theme(
      legend.position = "none",
      panel.border = element_rect(color = "black", fill = NA),
      strip.background = element_rect(fill = "gray90", color = "black"),
      strip.text = element_text(face = "bold")
    )
`geom_smooth()` using formula = 'y ~ x'

### As income increases, credit card balance also increases. Married people tend to have higher balances.
  ### (b)

### Married students tend to have considerably lower balances while earning relatively higher salaries. These variables are useful for predicting balance trends and help highlight the clear differences between the groups.
  ### (c)
  # Credit utilization: the share of the credit limit that is in use
  creditcard <- creditcard %>%
    mutate(utz = balance / limit)

  # Utilization against income, one panel per student x married combination,
  # with a linear fit per panel. Colour and point shape both encode `student`.
  ggplot(creditcard, aes(x = income, y = utz)) +
    geom_point(aes(color = student, shape = student), alpha = 0.5) +
    geom_smooth(method = "lm", se = FALSE, aes(color = student)) +
    scale_color_manual(values = c("No" = "orange", "Yes" = "pink3")) +
    # label_both builds the "<variable>: <value>" strip labels
    facet_grid(
      student ~ married,
      labeller = labeller(student = label_both, married = label_both)
    ) +
    scale_x_continuous(labels = label_dollar(prefix = "$", suffix = "k")) +
    scale_y_continuous(labels = label_percent(accuracy = 1)) +
    labs(
      x = "Income",
      y = "Credit utilization"
    ) +
    theme(
      strip.background = element_rect(fill = "gray90", color = "black"),
      strip.text = element_text(face = "bold"),
      legend.position = "none",
      panel.border = element_rect(color = "black", fill = NA)
    )
`geom_smooth()` using formula = 'y ~ x'

  ### (d)

### The relationship between income and credit utilization is completely different from the one in (b). Credit utilization is higher for married people than for the other group. This difference is useful for interpreting the credit behavior of different groups of people.

6 5 - Napoleon’s march.

### Reference from -- https://www.andrewheiss.com/blog/2017/08/10/exploring-minards-1812-plot-with-ggplot2/
### Detailed explanation below for this plot's snippets

### Read the saved R object (an .rds file) that holds the three data sets
napoleon <- read_rds("data/napoleon.rds")

### Extract the data sets by position: cities first, temperatures second,
### troop movements third
troops <- napoleon[[3]]
temp <- napoleon[[2]]
cities <- napoleon[[1]]

### Add a text label to every temperature reading, built from the
### temperature, the month and the day
temps.nice <- mutate(
  temp,
  nice.label = paste0(temp, "°, ", month, ". ", day)
)


### Start an empty plot and draw the troop movement as paths, one per group,
### coloured by direction of travel, with line width mapped to the number of
### survivors. `linewidth` is the line-thickness aesthetic in ggplot2 >= 3.4.0.
march.1812.plot.simple <- ggplot() +
  geom_path(
    data = troops,
    aes(
      x = long, y = lat, group = group,
      color = direction, linewidth = survivors
    ),
    lineend = "round"
  ) +

  ### Add the cities as orange points
  geom_point(
    data = cities, aes(x = long, y = lat),
    color = "orange"
  ) +

  ### Add the city names with ggrepel's text labels
  geom_text_repel(
    data = cities, aes(x = long, y = lat, label = city),
    color = "orange"
  ) +

  ### Range of line widths used across the survivor counts
  scale_linewidth(range = c(0.5, 10)) +

  ### Manually set the colours for the direction values
  scale_colour_manual(values = c("blue", "black")) +

  ### Hide both legends (better data-to-ink ratio); "none" is the supported
  ### way to switch a guide off
  guides(color = "none", linewidth = "none") +

  ### Customized labels
  labs(
    title = "Napoleon's Troop Campaign",
    subtitle = "Path and size of the army during the campaign",
    x = "Longitude",
    y = "Latitude",
    caption = "Data source: https://www.andrewheiss.com/blog/2017/08/10/exploring-minards-1812-plot-with-ggplot2/"
  )
### New plot showing temperature against longitude
temps.1812.plot <- ggplot(data = temps.nice, aes(x = long, y = temp)) +
  geom_line() +

  ### Label every reading with its formatted temperature and date
  geom_label(aes(label = nice.label), size = 2.5) +

  ### Label only the y axis
  labs(x = NULL, y = "° Celsius") +

  ### Reuse the x range of the march plot so the two plots line up.
  ### The built plot stores it under `panel_params`; the older
  ### `panel_ranges` element no longer exists and yields NULL limits.
  scale_x_continuous(
    limits = ggplot_build(march.1812.plot.simple)$layout$panel_params[[1]]$x.range
  ) +

  ### Put the y axis on the right and fix the visible temperature range
  scale_y_continuous(position = "right") +
  coord_cartesian(ylim = c(-35, 5)) +

  ### Remove the vertical and minor grid lines, the x-axis text, the axis
  ### ticks and the panel border
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks = element_blank(),
    panel.border = element_blank()
  )

### Stack the march plot on top of the temperature plot
both.1812.plot.simple <- rbind(
  ggplotGrob(march.1812.plot.simple),
  ggplotGrob(temps.1812.plot)
)

### Layout rows that hold the two plot panels
panels <- both.1812.plot.simple$layout$t[
  grepl("panel", both.1812.plot.simple$layout$name, fixed = TRUE)
]

### Give the two panels a 3:1 height ratio
both.1812.plot.simple$heights[panels] <- unit(c(3, 1), "null")

### Draw the combined plot on a fresh page
grid::grid.newpage()
grid::grid.draw(both.1812.plot.simple)